0. Project Setup¶
0.1 Packages & Device¶
# Torch
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, Subset, random_split
from tqdm import tqdm
# Augmentation
import albumentations as A
from albumentations.pytorch import ToTensorV2
import scipy.io as sio
# Visualize Result
from sklearn.metrics import (confusion_matrix, accuracy_score,
precision_score, recall_score,
f1_score, roc_auc_score,
roc_curve, auc, precision_recall_curve,
average_precision_score)
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import label_binarize
# Basic
import numpy as np
import cv2
import os
import time
from typing import List, Tuple, Union
import random
import itertools
import copy
E:\Courses\CISC3024-Pattern-Recognition\cisc3024_pr_venv\lib\site-packages\albumentations\__init__.py:13: UserWarning: A new version of Albumentations is available: 1.4.20 (you have 1.4.18). Upgrade using: pip install -U albumentations. To disable automatic update checks, set the environment variable NO_ALBUMENTATIONS_UPDATE to 1. check_for_updates()
# Select CUDA when available, otherwise fall back to CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device_name}")
Using device: cuda
0.2 Global Configurations¶
path_dataset = "./data/SVHN_mat"
1. Data Processing and Augmentation¶
1.1 Download Datasets¶
Define dataset class, retrieve dataset.
1.1 Notes¶
# Exploratory cell: load the raw SVHN training split and inspect its layout.
_dat = sio.loadmat(os.path.join(path_dataset, "train_32x32.mat"))
# _dat['X'][0][0][0]
# np.array(_dat).shape
# Raw 'X' is stored (H, W, C, N); move the sample axis first -> (N, H, W, C).
dat = np.transpose(_dat['X'], (3, 0, 1, 2))
# dat = dat.astype(np.float32)
dat.shape
# First Image
dat[0].shape
# First Row of Image
dat[0][0]
# First Pixel of Image
dat[0][0][0]
# Sanity-check an Albumentations pipeline: per-channel SVHN mean/std,
# then HWC uint8 -> CHW float tensor via ToTensorV2.
_transform = A.Compose([
    A.Normalize(mean=[0.4376845359802246, 0.4437684714794159, 0.47280389070510864], std=[0.19803018867969513, 0.2010156661272049, 0.19703581929206848]),
    ToTensorV2()
])
_img = dat[0]
_img = _transform(image=_img)['image']
# print(_img)
_img.shape
1.2 Dataset¶
class SVHNDataset(Dataset):
def __init__(self, mat_file, transform=None):
data = sio.loadmat(mat_file)
self.images = np.transpose(data['X'], (3, 0, 1, 2))
self.labels = data['y'].flatten()
self.labels[self.labels == 10] = 0
self.transform = transform # Allow postponed injection of transform.
def __len__(self):
return len(self.labels)
def __getitem__(self, idx):
image = self.images[idx]
label = self.labels[idx]
# There should always be a transform.
# It converts image to float, and permutes it from (32, 32, 3) to Tensor([3, 32, 32]).
# ...which is important!!
if self.transform is None:
raise ValueError("CISC3024 Custom Error: The transform should not be None when this object is passed into a DataLoader.")
image = self.transform(image=image)['image']
return image, label
def get_meanstd(self, bias=None):
if bias is not None:
random_bias = random.randint(0, bias)
images_ = []
for i in range(len(self.images)):
image = self.images[i]
image = image.astype(np.int16)
image = (image + random_bias) % 256
image = image.astype(np.uint8)
images_.append(image)
images_ = np.array(images_)
else:
images_ = self.images
images_ = images_.astype(np.float32) / 255.0
mean = np.mean(images_, axis=(0,1,2))
std = np.std(images_, axis=(0,1,2), ddof=0)
return mean.tolist(), std.tolist()
def overwrite(self, indices:Union[list, np.ndarray]):
"""
Create a deep copy of the mother dataset instance and only keep the wanted
data samples, controlled by indices.
"""
if any(index < 0 or index >= len(self.labels) for index in indices):
raise IndexError("CISC3024 Custom Error: One or more indices are out of bounds.")
new_dataset = copy.deepcopy(self)
new_dataset.images = self.images[indices]
new_dataset.labels = self.labels[indices]
return new_dataset
1.2 Peek At Data¶
def peek(dataset, norm_mean=(0.4376845359802246, 0.4437684714794159, 0.47280389070510864),
         norm_std=(0.19803018867969513, 0.2010156661272049, 0.19703581929206848)):
    """Show a row of random samples from `dataset` after undoing normalization.

    Fixes two defects of the original:
    - it read `norm_mean` / `norm_std` from notebook globals that were never
      defined (NameError); they are now parameters defaulting to the SVHN
      per-channel statistics used throughout the notebook;
    - it drew one random index before the loop and displayed the same image
      in all six columns; a fresh index is now drawn per column.
    """
    def unnormalize(img, mean, std):
        """Revert the normalization for visualization."""
        img = img * std + mean
        return np.clip(img, 0, 1)
    # Plotting multiple images in a grid
    grid_rows, grid_cols = 1, 6
    fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(6, 6))
    for i in range(grid_cols):
        peek_index = random.randint(0, len(dataset) - 1)
        img_tensor, label = dataset[peek_index]
        img = img_tensor.permute(1, 2, 0).numpy()  # Convert to (H, W, C)
        img = unnormalize(img, np.asarray(norm_mean), np.asarray(norm_std))
        ax = axes[i]  # Get subplot axis
        ax.imshow(img)
        ax.set_title(f"Label: {label}")
    plt.tight_layout()
    plt.show()
print(f"Peaking data from training set of index {peak_index}.\nImage Tnesor Size:{train_dataset.__getitem__(peak_index)[0].shape}")
2. Neural Network¶
2.1 Model Structure¶
class SmallVGG(nn.Module):
    """A small VGG-style CNN for 10-class classification of RGB images.

    Three conv stages, each ending in a 2x2 max-pool that halves the spatial
    size, followed by a two-layer fully-connected classifier head.
    """

    def __init__(self, frame_size=32):
        super(SmallVGG, self).__init__()
        self.frame_size = frame_size
        self.conv_layers = nn.Sequential(
            nn.Conv2d(3, 8, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(8, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # frame_size/2
            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # frame_size/4
            nn.Conv2d(32, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # frame_size/8
        )
        # The conv stack outputs 32 channels at (frame_size/8) x (frame_size/8).
        # The original hard-coded `frame_size * 4 * 4`, which only equals the
        # true flattened size (32 * 4 * 4 = 512) when frame_size == 32 and
        # crashed for any other frame size.
        flat_features = 32 * (frame_size // 8) ** 2
        self.fc_layers = nn.Sequential(
            nn.Linear(flat_features, 256),
            nn.ReLU(),
            nn.Linear(256, 10)
        )

    def forward(self, x):
        """x: (B, 3, frame_size, frame_size) -> logits of shape (B, 10)."""
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)
        x = self.fc_layers(x)
        return x
2.2 Train and Evaluate Function¶
def train_and_evaluate(model,
                       train_loader,
                       valid_loader,
                       criterion,
                       optimizer,
                       num_epochs=100):
    """Train `model` and record per-epoch average train/validation losses.

    Returns (train_losses, valid_losses), one entry per epoch.

    Fix: losses are accumulated per sample (loss.item() * batch size), so the
    correct denominator is the number of samples. The original divided the
    per-sample sum by the number of *batches*, inflating every reported loss
    by roughly the batch size.
    """
    # Record Losses to plot
    train_losses = []
    valid_losses = []
    for epoch in range(num_epochs):
        # ---- Train ----
        model.train()
        running_loss = 0.0
        for images, labels in tqdm(train_loader):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * len(images)
        train_losses.append(running_loss / len(train_loader.dataset))
        # ---- Evaluate ----
        model.eval()
        valid_loss = 0.0
        with torch.no_grad():
            for images, labels in valid_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)
                valid_loss += loss.item() * len(images)
        valid_losses.append(valid_loss / len(valid_loader.dataset))
        print(f"Epoch[{epoch+1}/{num_epochs}], Train Loss:{train_losses[-1]:.4f}, Validation Loss:{valid_losses[-1]:.4f}")
    return train_losses, valid_losses
2.3 Get Predictions¶
Multiple functions are defined to evaluate data. Below is a list of them.
def get_predictions(model_path, extra_loader):
    """Run a saved SmallVGG over `extra_loader` and collect its predictions.

    `model_path` may be either a filesystem path to a saved state dict or an
    already-loaded state dict. Returns (pred_scores, true_labels, pred_labels)
    where pred_scores holds per-class softmax probabilities per sample.
    """
    if not isinstance(model_path, str):
        model_state = model_path
    else:
        # map_location keeps GPU-saved checkpoints loadable on CPU-only hosts.
        model_state = torch.load(model_path, map_location=device)
    model = SmallVGG()
    model.load_state_dict(model_state)
    model.to(device)
    model.eval()
    pred_scores = []  # Prob. of predictions
    true_labels = []  # Ground Truth
    pred_labels = []  # Label of prediction, i.e., argmax(softmax(pred_scores))
    with torch.no_grad():
        for images, labels in tqdm(extra_loader):
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            pred_scores_batch = nn.functional.softmax(outputs, dim=-1)
            pred_scores.extend(pred_scores_batch.cpu().tolist())
            pred_labels.extend(outputs.argmax(dim=1).tolist())
            true_labels.extend(labels.cpu().tolist())
    return pred_scores, true_labels, pred_labels
2.4 Get Metrics¶
def get_metrics(true_labels, pred_labels):
    """Compute overall accuracy plus per-class precision/recall/F1 (classes 0-9)."""
    class_labels = range(0, 10)
    accuracy = accuracy_score(true_labels, pred_labels)
    precision = precision_score(true_labels, pred_labels,
                                zero_division=1, average=None, labels=class_labels)
    recall = recall_score(true_labels, pred_labels,
                          zero_division=1, average=None, labels=class_labels)
    f1 = f1_score(true_labels, pred_labels,
                  zero_division=0, average=None, labels=class_labels)
    return accuracy, precision, recall, f1
def print_metrics(accuracies, f1s):
    """Pretty-print accuracies and per-class F1 lists, then flag the best run.

    "Best" is the run (1-indexed) with the highest mean per-class F1.
    """
    print("Accuracies:")
    for acc in accuracies:
        print(f"{acc:.3f}", end=" ")
    print("\n")
    print("F1 Score Lists:")
    mean_f1s = []
    for f1 in f1s:
        for val in f1:
            print(f"{val:.3f}", end=" ")
        mean_f1 = np.mean(f1)
        std_f1 = np.std(f1)
        mean_f1s.append(mean_f1)
        # Format std like every other number (the original printed full precision).
        print(f"| Avg F1={mean_f1:.3f}, Std F1={std_f1:.3f}")
    print(f"Best: {np.argmax(mean_f1s)+1}-th")
# Compute ROC AUC for each class
def get_roc_auc(true_labels_bin, pred_labels_bin):
    """Per-class ROC-AUC (classes 0-9) from one-hot ground truth and scores.

    Fix: the original body ignored its second parameter and read a
    notebook-global `pred_scores`, so results silently depended on whichever
    cell ran last. The parameter is now actually used.
    """
    scores = np.array(pred_labels_bin)
    return {i: roc_auc_score(true_labels_bin[:, i], scores[:, i]) for i in range(0, 10)}
3. Experiments¶
3.0 Preparation¶
3.0.1 Plot Functions¶
The experiments will be a list of the following structures:
{
"HYPER_PARAM_1": combo[0],
"HYPER_PARAM_2": combo[1],
"train_losses": train_losses,
"valid_losses": valid_losses,
"model_state_dict": exp_model.state_dict()
}
Epoch-Loss Curves¶
def plot_el(loaded_experiments, hyper_param_names, n_rows=4, n_cols=4):
    """Plot train/validation epoch-loss curves, one subplot per experiment."""
    name_a, name_b = hyper_param_names
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(n_cols * 5, n_rows * 5))
    for idx, ax in enumerate(axes.flat):
        exp = loaded_experiments[idx]
        tr_losses = exp["train_losses"]
        va_losses = exp["valid_losses"]
        ax.plot(tr_losses, label=f"TRL, min={np.min(tr_losses):.3f}")
        ax.plot(va_losses, label=f"VAL, min={np.min(va_losses):.3f} at step={np.argmin(va_losses)}")
        ax.set_xlabel("Epochs")
        ax.set_ylabel("Loss")
        ax.set_title(f"{name_a}={exp[name_a]}, {name_b}={exp[name_b]}")
        ax.legend(loc="upper right")
    plt.show()
Get Experiment Results¶
def get_experiment_results(loaded_experiments, test_hyperparam_names, extra_loader):
    """Evaluate every experiment's saved model on `extra_loader`.

    Returns a list of dicts, one per experiment, carrying its two
    hyper-parameter values plus true labels, predicted labels and per-class
    softmax scores. Also echoes a small preview of each result.
    """
    experiment_results = []
    n1, n2 = test_hyperparam_names
    for i, exp in enumerate(loaded_experiments):
        pred_scores, true_labels, pred_labels = get_predictions(exp['model_state_dict'], extra_loader)
        experiment_results.append({
            n1: exp[n1],
            n2: exp[n2],
            "true_labels": true_labels,
            "pred_labels": pred_labels,
            "pred_scores": pred_scores
        })
        # Plain loops instead of the original side-effect list comprehensions
        # (`[print(...) for ...]`); output is identical.
        print(f"First 10 true labels:")
        for num in true_labels[:10]:
            print(num, end=" ")
        print(f"...\n")
        print(f"First 10 pred labels:")
        for num in pred_labels[:10]:
            print(num, end=" ")
        print(f"...\n")
        print(f"First 5 pred_scores:")
        for num in pred_scores[:5]:
            print(num, end=" ")
        print(f"...\n")
        torch.cuda.empty_cache()
    return experiment_results
Confusion Matrix¶
def plot_cm(experiment_results, hyper_param_names, n_rows=4, n_cols=4):
    """Draw one confusion matrix per experiment on an n_rows x n_cols grid."""
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 5, n_rows * 5))
    axes = axes.flatten()
    name_a, name_b = hyper_param_names
    for idx, result in enumerate(experiment_results):
        matrix = confusion_matrix(result['true_labels'], result['pred_labels'])
        display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=range(0, 10))
        display.plot(ax=axes[idx], cmap=plt.cm.Blues)
        axes[idx].set_title(f"Exp {idx+1}: {name_a}={result[name_a]}, {name_b}={result[name_b]}")
    plt.tight_layout()
    plt.show()
Precision-Recall Curve¶
def plot_pr(experiment_results, hyper_param_names, n_rows=4, n_cols=4):
    """Plot per-class precision-recall curves for every experiment.

    Returns (accuracies, f1_scores): overall accuracy and the per-class F1
    array for each experiment, in input order.
    """
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 5, n_rows * 5))
    axes = axes.flatten()
    hparam_1, hparam_2 = hyper_param_names
    accuracies = []
    f1_scores = []
    for i, exp_rs in enumerate(experiment_results):
        true_labels, pred_labels, pred_scores = exp_rs['true_labels'], exp_rs['pred_labels'], exp_rs['pred_scores']
        # One-vs-rest binarization of the ground truth for per-class curves.
        # (The original also binarized pred_labels but never used the result.)
        true_labels_bin = label_binarize(true_labels, classes=range(0, 10))
        # Hoisted out of the inner loop: the original rebuilt this array
        # twice per class.
        scores = np.array(pred_scores)
        accuracy, precision, recall, f1 = get_metrics(true_labels, pred_labels)
        accuracies.append(accuracy)
        f1_scores.append(f1)
        for j in range(0, 10):
            precision_j, recall_j, _ = precision_recall_curve(true_labels_bin[:, j], scores[:, j])
            average_precision = average_precision_score(true_labels_bin[:, j], scores[:, j])
            axes[i].step(recall_j, precision_j, where="post", label=f"Class {j} AP={average_precision:.2f}")
        axes[i].set_title(f"PR-Curve {hparam_1}={exp_rs[hparam_1]}, {hparam_2}={exp_rs[hparam_2]}")
        axes[i].legend()
        axes[i].set_xlabel("Recall")
        axes[i].set_ylabel("Precision")
    plt.tight_layout()
    plt.show()
    return accuracies, f1_scores
ROC-AUC Curve¶
def plot_rocauc(experiment_results, hyper_param_names, curve_type, n_rows=4, n_cols=4):
    """Plot ROC curves for every experiment on a subplot grid.

    curve_type: "macro_micro" draws only the macro-/micro-averaged curves;
    "all" draws the ten per-class curves. Any other value plots only the
    diagonal chance line.
    """
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 5, n_rows * 5))
    axes = axes.flatten()
    hparam_1, hparam_2 = hyper_param_names
    for i, exp_rs in enumerate(experiment_results):
        true_labels, pred_scores = exp_rs['true_labels'], exp_rs['pred_scores']
        # One-vs-rest binarization of the ground truth for per-class ROC.
        true_labels_bin = label_binarize(true_labels, classes=range(0, 10))
        # All Classes' ROC curve & ROC Area Under Curve
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for j in range(10):
            fpr[j], tpr[j], _ = roc_curve(true_labels_bin[:, j], np.array(pred_scores)[:, j])
            roc_auc[j] = auc(fpr[j], tpr[j])
        # Macro-Average ROC & ROC-AUC: interpolate each class's TPR onto the
        # union of all classes' FPR grid points, then average over 10 classes.
        all_fpr = np.unique(np.concatenate([fpr[j] for j in range(10)]))
        mean_tpr = np.zeros_like(all_fpr)
        for j in range(10):
            mean_tpr += np.interp(all_fpr, fpr[j], tpr[j])
        mean_tpr /= 10
        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
        # Compute micro-average ROC curve and ROC area: pool every per-class
        # decision into a single binary problem before computing the curve.
        fpr["micro"], tpr["micro"], _ = roc_curve(true_labels_bin.ravel(), np.array(pred_scores).ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
        # Plot only Macro or Micro ROC curves
        if curve_type == "macro_micro":
            axes[i].plot(fpr["macro"], tpr["macro"], label=f"Macro (AUC={roc_auc['macro']:.2f})")
            axes[i].plot(fpr["micro"], tpr["micro"], label=f"Micro (AUC={roc_auc['micro']:.2f})")
        elif curve_type == "all":
            # Plot all ROC curves
            for j in range(10):
                axes[i].plot(fpr[j], tpr[j], label=f"Class {j} (AUC={roc_auc[j]:.2f})")
        # Diagonal chance line plus axis labels, drawn for every subplot.
        axes[i].plot([0, 1], [0, 1], "k--")
        axes[i].set_xlabel("False Positive Rate")
        axes[i].set_ylabel("True Positive Rate")
        axes[i].set_title(f"ROC Curve {i+1}, {hparam_1}={exp_rs[hparam_1]}, {hparam_2}={exp_rs[hparam_2]}")
        axes[i].legend(loc='lower right')
    plt.tight_layout()
    plt.show()
3.0.1 Datasets¶
def split_train_valid(train_dataset, train_ratio):
    """Randomly split `train_dataset` into train/validation dataset copies.

    random_split yields Subset wrappers, so the real datasets are rebuilt
    from the subsets' indices via SVHNDataset.overwrite.
    """
    total = len(train_dataset)
    n_train = int(train_ratio * total)
    n_valid = total - n_train
    # Don't use the Subset objects directly — materialize proper copies.
    subset_train, subset_valid = random_split(train_dataset, [n_train, n_valid])
    return (train_dataset.overwrite(indices=subset_train.indices),
            train_dataset.overwrite(indices=subset_valid.indices))
3.1 Experiment 1: Optimizer¶
In the standard process of gradient descent, each update is proportional to the negative gradient (first-order derivative) of the loss function with respect to the parameter. In this traditional process, the learning rate is fixed, and it may cause problems.
- Oscillations. If locally, the learning rate is too high, the model will jump around the local minimum.
- Slow convergence. If locally, the learning rate is too low, the model will spend a lot of epochs to converge to a local minimum.
To solve this problem, we enable the learning rate to be adaptive by introducing the "momentum", a velocity-like term which accumulates past gradients in the direction of consistent descent.
- The velocity term is the weighted sum of previous gradients.
- ...such that the update direction does not only rely on the current gradient, but also on previous ones.
The update of velocity is represented as: $$ v_t=\beta v_{t-1} + (1-\beta)\cdot\nabla J(\theta) $$ where $\beta$ is the momentum coefficient. In our experiments, $\beta$ will be fixed to $0.9$.
The update of parameters will be: $$ \theta_{t} = \theta_{t-1}-\eta\cdot v_{t} $$ In this experiment, we focus on the performance of different optimizers, each of which has its own optimized way to update the momentum. We will fix other variables, including transform, epoch number and learning rate, and only adjust the optimizers. There are a few optimizers to be chosen:
- Adaptive Moment Estimation (Adam)
- Stochastic Gradient Descent (SGD)
- Root Mean Square Propagation (RMSprop)
- Adam with Weight Decay (AdamW)
- Adaptive Gradient Algorithm (Adagrad)
- SGD with Momentum and Nesterov Accelerated Gradient
# Universal Train Dataset without splitting
# Used only to derive the per-channel normalization statistics for Experiment 1.
exp1_universal_train_dataset = SVHNDataset(mat_file=os.path.join(path_dataset,"train_32x32.mat"))
exp1_mean, exp1_std = exp1_universal_train_dataset.get_meanstd()
print(f"Channel Means: {exp1_mean}")
print(f"Channel Stds: {exp1_std}")
Channel Means: [0.4376845359802246, 0.4437684714794159, 0.47280389070510864] Channel Stds: [0.19803018867969513, 0.2010156661272049, 0.19703581929206848]
Define changing & non-changing hyper parameters.
# Fixed hyper-parameters for Experiment 1; only the optimizer varies below.
exp1_hyperparams = {
    "num_epochs": 25,
    "lr": 1e-3,
    "criterion": nn.CrossEntropyLoss(),
    "transform": A.Compose([
        A.Normalize(mean=exp1_mean, std=exp1_std),
        ToTensorV2()
    ])
}
# One fresh model per candidate optimizer so runs do not share weights.
exp1_models = [SmallVGG().to(device) for _ in range(0,6)]
candidate_optimizers = [
    optim.Adam(exp1_models[0].parameters(), lr=exp1_hyperparams['lr']),
    optim.SGD(exp1_models[1].parameters(), lr=exp1_hyperparams['lr'], momentum=0.9),
    optim.RMSprop(exp1_models[2].parameters(), lr=exp1_hyperparams['lr']),
    optim.AdamW(exp1_models[3].parameters(), lr=exp1_hyperparams['lr'], weight_decay=0.01),
    optim.Adagrad(exp1_models[4].parameters(), lr=exp1_hyperparams['lr']),
    # SGD with momentum plus Nesterov accelerated gradient.
    optim.SGD(exp1_models[5].parameters(), lr=exp1_hyperparams['lr'], momentum=0.9, nesterov=True)]
Train, Validation and Test datasets.
# Train & Test Dataset
exp1_train_dataset = SVHNDataset(mat_file=os.path.join(path_dataset,"train_32x32.mat"), transform=exp1_hyperparams['transform'])
# 80/20 train/validation split.
exp1_train_dataset, exp1_valid_dataset = split_train_valid(exp1_train_dataset, train_ratio=0.8)
# Test Dataset
exp1_test_dataset = SVHNDataset(mat_file=os.path.join(path_dataset,"test_32x32.mat"), transform=exp1_hyperparams['transform'])
print(f"Train Size:{exp1_train_dataset.__len__()}\nValidation Size:{exp1_valid_dataset.__len__()}\nTest Size:{exp1_test_dataset.__len__()}")
Train Size:58605 Validation Size:14652 Test Size:26032
Train, Validation and Test Data Loaders.
# Data Loaders
# Batch size 128; shuffle train/validation batches, keep test order fixed.
exp1_train_loader = DataLoader(exp1_train_dataset, batch_size=128, shuffle=True)
exp1_valid_loader = DataLoader(exp1_valid_dataset, batch_size=128, shuffle=True)
exp1_test_loader = DataLoader(exp1_test_dataset, batch_size=128, shuffle=False)
Run Experiments
def run_exp1(optimizers, models, hyper_params, train_loader, valid_loader):
    """Run Experiment 1: train each (optimizer, model) pair with the shared
    hyper-parameters and collect losses plus final weights."""
    experiments = []
    criterion = hyper_params['criterion']
    num_epochs = hyper_params['num_epochs']
    for run_idx, (opt, net) in enumerate(zip(optimizers, models), start=1):
        print(f"Experiment {run_idx}. Running experiment on optimizer: {opt.__class__.__name__}")
        train_losses, valid_losses = train_and_evaluate(net, train_loader, valid_loader, criterion, opt, num_epochs)
        experiments.append({
            "optimizer": opt.__class__.__name__,
            "others": "same",
            "train_losses": train_losses,
            "valid_losses": valid_losses,
            "model_state_dict": net.state_dict()
        })
        # Drop references and release cached GPU memory between runs.
        torch.cuda.empty_cache()
    return experiments
exp1 = run_exp1(candidate_optimizers, exp1_models, exp1_hyperparams, exp1_train_loader, exp1_valid_loader)
# Timestamp-derived filename avoids clobbering earlier saved runs.
time_str = str(time.time()).replace(".","")
torch.save(exp1, f"./models/exp1_{time_str}.pth")
Load Experiments
Load Experiment objects and plot results.
# Reload a specific saved Experiment 1 run, evaluate it on the test set,
# then plot epoch-loss curves and confusion matrices.
exp1_loaded = torch.load("./models/exp1_17303963644961627.pth")
exp1_results = get_experiment_results(exp1_loaded, test_hyperparam_names=["optimizer", "others"], extra_loader=exp1_test_loader)
plot_el(exp1_loaded, ["optimizer", "others"], n_rows=1, n_cols=6)
plot_cm(exp1_results, ["optimizer", "others"], n_rows=1, n_cols=6)
Precision-Recall Curve
Overfitted to inspect training performance. This "Best" does not tell the optimum optimizer.
# Per-class PR curves plus accuracy/F1 summary for Experiment 1.
exp1_accuracies, exp1_f1s = plot_pr(exp1_results, ["optimizer", "others"], n_rows=1, n_cols=6)
print_metrics(exp1_accuracies, exp1_f1s)
Accuracies: 0.907 0.196 0.910 0.908 0.791 0.196 F1 Score Lists: 0.910 0.941 0.933 0.863 0.927 0.911 0.883 0.899 0.859 0.861 | Avg F1=0.899, Std F1=0.029276089747144043 0.000 0.328 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 | Avg F1=0.033, Std F1=0.09827503131926378 0.910 0.937 0.939 0.877 0.909 0.915 0.897 0.922 0.860 0.861 | Avg F1=0.903, Std F1=0.027107602833846337 0.909 0.941 0.944 0.865 0.923 0.911 0.891 0.918 0.855 0.833 | Avg F1=0.899, Std F1=0.035261349926122475 0.767 0.871 0.856 0.744 0.790 0.759 0.750 0.805 0.647 0.689 | Avg F1=0.768, Std F1=0.06501732647978666 0.000 0.328 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 | Avg F1=0.033, Std F1=0.09827503131926378 Best: 3-th
# ROC curves: all ten classes first, then macro/micro averages only.
plot_rocauc(exp1_results, ["optimizer", "others"], curve_type="all", n_rows=1, n_cols=6)
plot_rocauc(exp1_results, ["optimizer", "others"], curve_type="macro_micro", n_rows=1, n_cols=6)
3.2 Experiment 2: Epoch Number and Learning Rate¶
This experiment seeks to find the effect of different combinations of epoch numbers and learning rates on the training & testing performance of the neural network.
3.2.1 Experiment 2-1: Rough Search¶
In this sub-experiment, we perform a rough search on the epochs and learning rate. We promoted four possible values for both parameters: $$ \text{candidate epochs}=\{10, 15, 20, 25\} $$ $$ \text{candidate lr}=\{1.0\times 10^{-3},1.0\times 10^{-4},1.0\times 10^{-5},1.0\times 10^{-6}\} $$
# Universal Train Dataset without splitting
# Recompute the normalization statistics for Experiment 2 (same training file).
exp2_universal_train_dataset = SVHNDataset(mat_file=os.path.join(path_dataset,"train_32x32.mat"))
exp2_mean, exp2_std = exp2_universal_train_dataset.get_meanstd()
print(f"Channel Means: {exp2_mean}")
print(f"Channel Stds: {exp2_std}")
Channel Means: [0.4376845359802246, 0.4437684714794159, 0.47280389070510864] Channel Stds: [0.19803018867969513, 0.2010156661272049, 0.19703581929206848]
# Fixed settings for Experiment 2; epochs and lr form the search grid below.
exp2_hyperparams = {
    "criterion": nn.CrossEntropyLoss(),
    "transform": A.Compose([
        A.Normalize(mean=exp2_mean, std=exp2_std),
        ToTensorV2()
    ]),
    "optimizer":optim.AdamW,
}
# Search grid: 4 epoch counts x 4 learning rates = 16 runs.
candidate_epochs = [10, 15, 20, 25]
candidate_lr = [1e-3, 1e-4, 1e-5, 1e-6]
# Free cached GPU memory left over from Experiment 1.
torch.cuda.empty_cache()
# Train & Validation Datasets
exp2_train_dataset = SVHNDataset(mat_file=os.path.join(path_dataset,"train_32x32.mat"), transform=exp2_hyperparams['transform'])
# 80/20 train/validation split.
exp2_train_dataset, exp2_valid_dataset = split_train_valid(exp2_train_dataset, train_ratio=0.8)
# Test Dataset
exp2_test_dataset = SVHNDataset(mat_file=os.path.join(path_dataset,"test_32x32.mat"), transform=exp2_hyperparams['transform'])
print(f"Train Size:{exp2_train_dataset.__len__()}\nValidation Size:{exp2_valid_dataset.__len__()}\nTest Size:{exp2_test_dataset.__len__()}")
Train Size:58605 Validation Size:14652 Test Size:26032
# Batch size 128; shuffle train/validation batches, keep test order fixed.
exp2_train_loader = DataLoader(exp2_train_dataset, batch_size=128, shuffle=True)
exp2_valid_loader = DataLoader(exp2_valid_dataset, batch_size=128, shuffle=True)
exp2_test_loader = DataLoader(exp2_test_dataset, batch_size=128, shuffle=False)
def run_exp2_1(epochs, lr_list, hyper_params, train_loader, valid_loader):
    """Grid-search every (num_epochs, lr) combination, fresh model per run."""
    experiments = []
    grid = itertools.product(epochs, lr_list)
    for run_idx, (num_epochs, lr) in enumerate(grid, start=1):
        print(f"Running Exp {run_idx}: num_epoch={num_epochs}, lr={lr}")
        net = SmallVGG().to(device)
        optimizer = hyper_params['optimizer'](net.parameters(), lr=lr, weight_decay=0.01)
        train_losses, valid_losses = train_and_evaluate(
            net, train_loader, valid_loader, hyper_params['criterion'], optimizer, num_epochs)
        experiments.append({
            "num_epochs": num_epochs,
            "lr": lr,
            "train_losses": train_losses,
            "valid_losses": valid_losses,
            "model_state_dict": net.state_dict()
        })
        del net, optimizer
    return experiments
exp2_1 = run_exp2_1(candidate_epochs, candidate_lr, exp2_hyperparams, exp2_train_loader, exp2_valid_loader)
# Timestamped save, then reload a specific run and evaluate on the test set.
time_str = str(time.time()).replace(".","")
torch.save(exp2_1, f"./models/exp2-1_{time_str}.pth")
exp2_1_loaded = torch.load("./models/exp2-1_17304023422640018.pth")
exp2_1_results = get_experiment_results(exp2_1_loaded, test_hyperparam_names=["num_epochs", "lr"], extra_loader=exp2_test_loader)
3.2.1-1 Epoch-Loss Curve¶
We found that the key to the training performance of a model is the learning rate. Epoch number only controls the progress of training.
From the perspective of learning rate (each column), only the learning rate of $1.0\times 10^{-3}$ shows a sign of convergence under each candidate epochs. With this learning rate, the model even overfitted under experiments with an epoch number over $15$. The best model we conclude from this rough selection is the one with the combination of $\text{num\_epoch}=10\land\text{lr}=1.0\times10^{-3}$. The minimum validation loss is $36.648$ at step $7$, which is the lowest of all $16$ samples. However, this doesn't mean that it is optimal since it may jump over a local minimum.
Moreover, as we inspect the performance on smaller learning rates, we found that they tend to converge only at much later epoch steps. For the learning rate of $1.0\times 10^{-6}$, the learning rate is so low that the model cannot fit within the available epochs at all.
plot_el(exp2_1_loaded, ["num_epochs", "lr"], n_rows=4, n_cols=4)
3.2.1-2 Confusion Matrix¶
In this rough search, the confusion matrix varies on different learning rates, and tends to be identical on different epochs.
Under the same epoch number, as the learning rate gets smaller, the confusion matrix gets "blurrier", meaning that the prediction is less accurate overall. Learning rates at or below $1.0\times 10^{-5}$ are too low for the model to converge in a reasonable number of epochs. For the lowest learning rate of $1.0\times 10^{-6}$, the model is not fitted at all. It classifies every number into 1, the most frequent class in the dataset.
plot_cm(exp2_1_results, ["num_epochs", "lr"], n_rows=4, n_cols=4)
3.2.1-3 Precision-Recall Curve¶
From a numerical perspective over the testing performance, the combination of $\text{num\_epoch}=15\land\text{lr}=1.0\times10^{-3}$ gives the highest accuracy of $0.907$, highest average $F_1$ score of $0.916$ and the lowest $F_1$ variance per-class of $0.025$.
# PR curves and accuracy/F1 summary for the rough grid search.
exp2_1_accuracies, exp2_1_f1s = plot_pr(exp2_1_results, ["num_epochs", "lr"], n_rows=4, n_cols=4)
print_metrics(exp2_1_accuracies, exp2_1_f1s)
Accuracies: 0.915 0.836 0.486 0.196 0.914 0.865 0.520 0.196 0.913 0.876 0.589 0.196 0.902 0.880 0.636 0.196 F1 Score Lists: 0.906 0.940 0.939 0.888 0.924 0.903 0.887 0.929 0.873 0.887 | Avg F1=0.908, Std F1=0.022740785980864705 0.827 0.914 0.886 0.770 0.857 0.805 0.771 0.882 0.717 0.731 | Avg F1=0.816, Std F1=0.06475924928424134 0.346 0.740 0.514 0.354 0.461 0.447 0.228 0.557 0.013 0.041 | Avg F1=0.370, Std F1=0.2152076147547944 0.000 0.328 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 | Avg F1=0.033, Std F1=0.09827503131926378 0.920 0.942 0.936 0.884 0.932 0.914 0.891 0.913 0.865 0.872 | Avg F1=0.907, Std F1=0.02589773808894772 0.874 0.925 0.913 0.817 0.878 0.835 0.817 0.897 0.770 0.769 | Avg F1=0.849, Std F1=0.05343572312454684 0.356 0.741 0.597 0.330 0.575 0.494 0.377 0.615 0.006 0.213 | Avg F1=0.430, Std F1=0.2071013592550223 0.000 0.328 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 | Avg F1=0.033, Std F1=0.09827503131926378 0.908 0.941 0.945 0.885 0.929 0.907 0.884 0.929 0.856 0.852 | Avg F1=0.904, Std F1=0.031728876018262464 0.868 0.933 0.924 0.832 0.894 0.857 0.830 0.900 0.787 0.794 | Avg F1=0.862, Std F1=0.04870114698696799 0.464 0.734 0.751 0.441 0.505 0.559 0.523 0.638 0.210 0.434 | Avg F1=0.526, Std F1=0.1507201592342292 0.000 0.328 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 | Avg F1=0.033, Std F1=0.09827503131926378 0.897 0.934 0.937 0.860 0.918 0.898 0.878 0.914 0.847 0.845 | Avg F1=0.893, Std F1=0.03238960226204535 0.885 0.935 0.917 0.823 0.896 0.860 0.844 0.898 0.824 0.796 | Avg F1=0.868, Std F1=0.04325231411216343 0.603 0.852 0.687 0.504 0.652 0.617 0.518 0.707 0.228 0.292 | Avg F1=0.566, Std F1=0.17967335896170858 0.000 0.328 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 | Avg F1=0.033, Std F1=0.09827503131926378 Best: 1-th
# ROC curves for the rough grid search: per-class, then macro/micro averages.
plot_rocauc(exp2_1_results, ["num_epochs", "lr"], curve_type="all", n_rows=4, n_cols=4)
plot_rocauc(exp2_1_results, ["num_epochs", "lr"], curve_type="macro_micro", n_rows=4, n_cols=4)
3.2.2 Experiment 2-2: Detailed¶
Previous sub-experiment tells that the best combination from all the listed ones is $\text{num\_epoch}=15 \land \text{lr}=1.0\times 10^{-3}$.
This is a rough solution, as it may jump over local minima. We want to find a better learning rate around $1.0\times 10^{-3}$, with an even more detailed distinction between candidate values, so that it may reveal a missing local minimum without using too many epochs.
We conducted an excessive experiment, purposely seeking an overfitting point over the listed candidate learning rates. We do this by setting the epoch number to $50$.
# Fixed settings for the detailed learning-rate search (optimizer is Adam here).
exp2_2_hyperparams = {
    "num_epoch": 20,
    "criterion": nn.CrossEntropyLoss(),
    "transform": A.Compose([
        A.Normalize(mean=exp2_mean, std=exp2_std),
        ToTensorV2()
    ]),
    "optimizer":optim.Adam,
}
# More detailed candidate learning rates around 1e-3, that is 10e-4.
# NOTE(review): 15e-4 appears twice and 16e-4 is absent — looks like a typo; confirm intent.
exp2_2_candidate_lr = [17e-4, 15e-4, 15e-4, 14e-4, 13e-4, 12e-4, 8e-4, 7e-4, 6e-4, 5e-4, 4e-4, 3e-4]
def run_exp2_2(lr_list, hyper_params, train_loader, test_loader):
    """Train one fresh model per candidate learning rate (fixed epoch count)."""
    experiments = []
    for run_idx, lr in enumerate(lr_list, start=1):
        print(f"Running Exp {run_idx}: lr={lr}")
        net = SmallVGG().to(device)
        epochs = hyper_params['num_epoch']
        loss_fn = hyper_params['criterion']
        opt = hyper_params['optimizer'](net.parameters(), lr=lr, weight_decay=0.01)
        train_losses, valid_losses = train_and_evaluate(net, train_loader, test_loader, loss_fn, opt, epochs)
        experiments.append({
            "num_epochs": epochs,
            "lr": lr,
            "train_losses": train_losses,
            "valid_losses": valid_losses,
            "model_state_dict": net.state_dict()
        })
        del net, loss_fn, opt
    return experiments
exp2_2 = run_exp2_2(exp2_2_candidate_lr, exp2_2_hyperparams, exp2_train_loader, exp2_valid_loader)
# Timestamped save, then reload a specific run and evaluate on the test set.
time_str = str(time.time()).replace(".", "")
torch.save(exp2_2, f"./models/exp2-2_{time_str}.pth")
exp2_2_loaded = torch.load("./models/exp2-2_17304071314112995.pth")
exp2_2_results = get_experiment_results(exp2_2_loaded, test_hyperparam_names=["num_epochs", "lr"], extra_loader=exp2_test_loader)
3.2.2-1 Epoch-Loss Curve¶
By inspecting the epoch-loss curve, we found that all the experiments are overfitted. This means that $50$ epochs are enough for conducting the detailed search.
From all the detailed searches, the learning rate of $14\times10^{-4}$, that is 1.4e-3, yields the lowest validation loss of $36.259$ at step $5$, which is the overfitting point. We discovered a new local minimum that's been jumped over by learning rate of 1e-3, which previously yield a validation loss of $36.648$.
plot_el(exp2_2_loaded, ["num_epochs", "lr"], n_rows=2, n_cols=6)
3.2.2-2 Confusion Matrix¶
At a glance, from the perspective of confusion matrix, the testing performance on unknown data is roughly identical.
plot_cm(exp2_2_results, ["num_epochs", "lr"], n_rows=2, n_cols=6)
3.2.2-3 Precision-Recall Curve¶
By inspecting the evaluation metrics, we found our judgement correct. From all the over-fitted model, the model with learning rate of $7.0\times 10^{-4}$ yields the highest accuracy of $0.909$ and the highest average per-class $F_1$ score of $0.903$. Besides, the per-class $F_1$ score is also less variant under the learning rate of $7.0\times 10^{-4}$, with the standard deviation of $0.023$.
# PR curves and accuracy/F1 summary for the detailed learning-rate search.
exp2_2_accuracies, exp2_2_f1s = plot_pr(exp2_2_results, ["num_epochs", "lr"], n_rows=2, n_cols=6)
print_metrics(exp2_2_accuracies, exp2_2_f1s)
Accuracies: 0.905 0.909 0.905 0.896 0.904 0.908 0.904 0.906 0.900 0.904 0.901 0.892 F1 Score Lists: 0.904 0.931 0.936 0.879 0.923 0.899 0.865 0.912 0.840 0.881 | Avg F1=0.897, Std F1=0.029019711710490413 0.902 0.939 0.937 0.877 0.929 0.897 0.884 0.917 0.854 0.879 | Avg F1=0.901, Std F1=0.027249619260366488 0.892 0.935 0.933 0.874 0.926 0.916 0.874 0.908 0.836 0.859 | Avg F1=0.895, Std F1=0.032047660610461416 0.895 0.928 0.933 0.856 0.923 0.901 0.872 0.881 0.836 0.841 | Avg F1=0.887, Std F1=0.033520138553783996 0.901 0.936 0.932 0.872 0.926 0.905 0.876 0.909 0.844 0.845 | Avg F1=0.895, Std F1=0.03214838081150217 0.909 0.930 0.935 0.879 0.926 0.911 0.885 0.912 0.863 0.863 | Avg F1=0.901, Std F1=0.025612410587384667 0.904 0.938 0.938 0.866 0.926 0.893 0.881 0.910 0.847 0.837 | Avg F1=0.894, Std F1=0.034195826718722434 0.901 0.934 0.941 0.873 0.922 0.913 0.880 0.903 0.848 0.850 | Avg F1=0.897, Std F1=0.03125728672040046 0.908 0.933 0.928 0.870 0.919 0.892 0.869 0.880 0.855 0.866 | Avg F1=0.892, Std F1=0.02681609191626165 0.907 0.937 0.943 0.870 0.919 0.906 0.872 0.913 0.852 0.819 | Avg F1=0.894, Std F1=0.037298183800018245 0.896 0.937 0.935 0.868 0.912 0.898 0.865 0.910 0.849 0.844 | Avg F1=0.892, Std F1=0.031961893604000764 0.903 0.926 0.912 0.853 0.913 0.889 0.873 0.882 0.841 0.852 | Avg F1=0.884, Std F1=0.02792655014372886 Best: 2-th
3.2.2-4 ROC-AUC Curve¶
The ROC-AUC Curve under all the detailed candidate learning rates are roughly identical.
# ROC curves for the detailed search: per-class, then macro/micro averages.
plot_rocauc(exp2_2_results, ["num_epochs", "lr"], curve_type="all", n_rows=2, n_cols=6)
plot_rocauc(exp2_2_results, ["num_epochs", "lr"], curve_type="macro_micro", n_rows=2, n_cols=6)
3.3 Experiment 3: Image Augmentation Parameters¶
3.3.1 Experiment 3-1: Rotation Angles and Crop Ratios¶
# Full training split, used only to derive normalization stats for Experiment 3-1.
exp3_universal_train_dataset = SVHNDataset(mat_file=os.path.join(path_dataset, "train_32x32.mat"))
# The mean & std here will only be used for experiment 3-1.
exp3_1_mean, exp3_1_std = exp3_universal_train_dataset.get_meanstd()
print(f"Channel Means: {exp3_1_mean}\nChannel Stds: {exp3_1_std}")
Channel Means: [0.4376845359802246, 0.4437684714794159, 0.47280389070510864] Channel Stds: [0.19803018867969513, 0.2010156661272049, 0.19703581929206848]
# Fixed settings for Experiment 3-1; rotation angle and crop scale vary below.
exp3_1_hyperparams = {
    "num_epochs": 10,
    "lr": 5e-4,
    "criterion": nn.CrossEntropyLoss(),
    "optimizer": optim.AdamW,
    # Test-time transform: normalization only, no augmentation.
    # Fix: use the Experiment 3-1 statistics computed above; the original
    # referenced exp1_mean/exp1_std from Experiment 1 (numerically identical
    # here, but a stale cross-experiment dependency).
    "transform": A.Compose([
        A.Normalize(mean=exp3_1_mean, std=exp3_1_std),
        ToTensorV2()
    ])
}
# Group 1: augmentation grid — rotation limits (degrees) and the lower bound
# of the RandomResizedCrop scale range.
candidate_angles = [15, 30, 45, 60]
candidate_crops = [0.08, 0.24, 0.40, 0.60] # Left Boundary
# Free cached GPU memory before Experiment 3.
torch.cuda.empty_cache()
# Train & Validation Datasets
# No transform yet: run_exp3_1 injects a fresh one per (angle, crop) combination.
exp3_train_dataset = SVHNDataset(mat_file=os.path.join(path_dataset, "train_32x32.mat"))
exp3_train_dataset, exp3_valid_dataset = split_train_valid(exp3_train_dataset, train_ratio=0.8)
# Test Dataset
exp3_test_dataset = SVHNDataset(mat_file=os.path.join(path_dataset, "test_32x32.mat"), transform=exp3_1_hyperparams["transform"])
print(f"Train Size:{exp3_train_dataset.__len__()}\nValidation Size:{exp3_valid_dataset.__len__()}\nTest Size:{exp3_test_dataset.__len__()}")
Train Size:58605 Validation Size:14652 Test Size:26032
exp3_test_loader = DataLoader(exp3_test_dataset, batch_size=128, shuffle=False)
def run_exp3_1(angles, crops, hyper_params, train_dataset, valid_dataset):
    """Train one SmallVGG per (rotation angle, crop lower-bound) combination.

    For every pair from the Cartesian product of ``angles`` x ``crops`` a fresh
    model is trained with an augmentation pipeline built from that pair.
    Returns a list of dicts holding the pair, the loss curves and the trained
    model weights.
    """
    results = []
    for run_idx, (angle, crop) in enumerate(itertools.product(angles, crops), start=1):
        print(f"Running Exp {run_idx}: angles={angle}, crop={crop}")
        model = SmallVGG().to(device)
        epochs = hyper_params['num_epochs']
        learning_rate = hyper_params['lr']
        loss_fn = hyper_params['criterion']
        opt = hyper_params['optimizer'](model.parameters(), lr=learning_rate, weight_decay=0.01)
        # Augmentation pipeline for this combination; normalization reuses the
        # channel statistics computed earlier for experiment 3-1.
        pipeline = A.Compose([
            A.RandomResizedCrop(32, 32, scale=(crop, 1.0)),
            A.Rotate(limit=angle),
            A.Normalize(mean=exp3_1_mean, std=exp3_1_std),
            ToTensorV2()
        ])
        print(f"Exp {run_idx}: Generating dataset from transform")
        # NOTE: the caller's datasets are mutated in place; they keep the last
        # pipeline after this function returns.
        train_dataset.transform = pipeline
        valid_dataset.transform = pipeline
        train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=128, shuffle=False)
        train_losses, valid_losses = train_and_evaluate(model,
                                                        train_loader,
                                                        valid_loader,
                                                        loss_fn,
                                                        opt,
                                                        epochs)
        results.append({
            "angle": angle,
            "crop": crop,
            "train_losses": train_losses,
            "valid_losses": valid_losses,
            "model_state_dict": model.state_dict()
        })
        # Drop references and release cached GPU memory before the next run.
        del model, loss_fn, opt
        del train_loader, valid_loader
        torch.cuda.empty_cache()
    return results
# Run all 16 angle/crop combinations (trains 16 models; slow).
exp3_1 = run_exp3_1(candidate_angles, candidate_crops, exp3_1_hyperparams, exp3_train_dataset, exp3_valid_dataset)
# Timestamp with the dot stripped keeps checkpoint filenames unique.
time_str = str(time.time()).replace(".","")
torch.save(exp3_1, f"./models/exp3-1_{time_str}.pth")
# Reload a previously saved run so the analysis below works without retraining.
exp3_1_loaded = torch.load("./models/exp3-1_1730412367135605.pth")
exp3_1_results = get_experiment_results(exp3_1_loaded, test_hyperparam_names=["angle", "crop"], extra_loader=exp3_test_loader)
# Loss curves, confusion matrices and precision/recall summaries per combination.
plot_el(exp3_1_loaded, ["angle", "crop"], n_rows=4, n_cols=4)
plot_cm(exp3_1_results, ["angle", "crop"], n_rows=4, n_cols=4)
exp3_accuracies, exp3_f1s = plot_pr(exp3_1_results, ["angle", "crop"], n_rows=4, n_cols=4)
print_metrics(exp3_accuracies, exp3_f1s)
Accuracies: 0.861 0.868 0.877 0.903 0.852 0.865 0.881 0.906 0.841 0.873 0.874 0.897 0.858 0.867 0.870 0.897 F1 Score Lists: 0.841 0.921 0.911 0.797 0.878 0.871 0.827 0.904 0.725 0.797 | Avg F1=0.847, Std F1=0.058953708849960075 0.810 0.920 0.906 0.816 0.911 0.884 0.839 0.917 0.745 0.796 | Avg F1=0.854, Std F1=0.05837212888352291 0.868 0.914 0.910 0.824 0.922 0.889 0.846 0.866 0.800 0.826 | Avg F1=0.867, Std F1=0.04009923059871438 0.882 0.940 0.944 0.872 0.910 0.914 0.869 0.925 0.829 0.825 | Avg F1=0.891, Std F1=0.040588392995837934 0.834 0.912 0.893 0.784 0.899 0.851 0.808 0.875 0.738 0.782 | Avg F1=0.837, Std F1=0.055312248934860725 0.851 0.920 0.911 0.807 0.897 0.855 0.806 0.893 0.763 0.808 | Avg F1=0.851, Std F1=0.050846063403817476 0.873 0.922 0.917 0.821 0.916 0.900 0.861 0.888 0.801 0.799 | Avg F1=0.870, Std F1=0.0453468039810251 0.891 0.930 0.941 0.876 0.926 0.917 0.882 0.905 0.846 0.848 | Avg F1=0.896, Std F1=0.03164409892053704 0.841 0.907 0.888 0.768 0.875 0.845 0.818 0.866 0.671 0.749 | Avg F1=0.823, Std F1=0.0694711993348085 0.847 0.920 0.922 0.833 0.902 0.870 0.832 0.882 0.776 0.787 | Avg F1=0.857, Std F1=0.04882954639950114 0.861 0.914 0.912 0.828 0.899 0.879 0.851 0.868 0.796 0.811 | Avg F1=0.862, Std F1=0.03895370150699186 0.878 0.932 0.928 0.859 0.935 0.903 0.875 0.898 0.845 0.819 | Avg F1=0.887, Std F1=0.03714507797632649 0.811 0.926 0.899 0.800 0.872 0.872 0.824 0.899 0.748 0.759 | Avg F1=0.841, Std F1=0.058551706311618164 0.826 0.926 0.909 0.822 0.892 0.864 0.816 0.902 0.762 0.804 | Avg F1=0.852, Std F1=0.051366126058318876 0.832 0.932 0.900 0.831 0.905 0.865 0.809 0.906 0.766 0.796 | Avg F1=0.854, Std F1=0.052532712015071564 0.883 0.936 0.927 0.855 0.915 0.905 0.875 0.906 0.837 0.827 | Avg F1=0.887, Std F1=0.035692420818319505 Best: 8-th
# ROC-AUC curves for experiment 3-1: per-class curves first, then macro/micro averages.
for curve_kind in ("all", "macro_micro"):
    plot_rocauc(exp3_1_results, ["angle", "crop"], curve_type=curve_kind, n_rows=4, n_cols=4)
3.3.2 Experiment 3-2: Ratios & Bias¶
# Hyper-parameters shared by every run in experiment 3-2. crop=0.6 / angle=30
# correspond to the best (8-th) combination found in experiment 3-1.
exp3_2_hyperparams = {
    "num_epochs": 10,
    "lr": 5e-4,
    "criterion": nn.CrossEntropyLoss(),
    "optimizer": optim.AdamW,
    "crop": 0.6,
    "angle": 30,
    # NOTE(review): this transform normalizes with exp1_mean/exp1_std from an
    # earlier experiment, while run_exp3_2 recomputes bias-shifted statistics
    # per run — confirm this plain normalize-only transform is intentional here.
    "transform": A.Compose([
        A.Normalize(mean=exp1_mean, std=exp1_std),
        ToTensorV2()
    ])
}
class AddBiasTransform:
    """Add a random constant offset to every pixel, wrapping around at 256.

    ``bias`` may be a single int (offset drawn uniformly from [0, bias]) or an
    ``(low, high)`` tuple (offset drawn uniformly from [low, high]). The input
    is assumed to be an integer image; the output keeps the input dtype.
    """

    def __init__(self, bias: Union[int, Tuple[int, int]]) -> None:
        if isinstance(bias, tuple):
            self.bias1, self.bias2 = bias[0], bias[1]
        else:
            self.bias1, self.bias2 = 0, bias

    def __call__(self, img: np.ndarray) -> np.ndarray:
        original_dtype = img.dtype
        offset = random.randint(self.bias1, self.bias2)
        # Widen to int16 first so the addition cannot overflow uint8 pixels,
        # then wrap into [0, 255] and restore the original dtype.
        shifted = (img.astype(np.int16) + offset) % 256
        return shifted.astype(original_dtype)
# Group 2: candidate lower aspect-ratio bounds for RandomResizedCrop (used as
# ratio=(r, 1/r)) and candidate additive pixel biases for AddBiasTransform.
candidate_ratios = [0.25, 0.42, 0.58, 0.75]
candidate_channel_biases = [0, 32, 64, 128]
Candidate values for the controlled experiment variables (aspect ratio and channel bias).
def run_exp3_2(ratios, biases, hyper_params, train_dataset, valid_dataset, universal_train_dataset):
    """Train one SmallVGG per (aspect ratio, channel bias) combination.

    Crop scale and rotation limit are fixed via ``hyper_params``; the
    normalization statistics are recomputed for each bias value from the raw
    training set. Returns a list of dicts holding the pair, the loss curves
    and the trained model weights.
    """
    results = []
    for run_idx, (ratio, bias) in enumerate(itertools.product(ratios, biases), start=1):
        print(f"Running Exp {run_idx}: ratio={ratio}, bias={bias}")
        model = SmallVGG().to(device)
        epochs = hyper_params['num_epochs']
        learning_rate = hyper_params['lr']
        loss_fn = hyper_params['criterion']
        opt = hyper_params['optimizer'](model.parameters(), lr=learning_rate, weight_decay=0.01)
        # The additive bias shifts the channel statistics, so recompute
        # mean/std for this bias value before building the pipeline.
        run_mean, run_std = universal_train_dataset.get_meanstd(bias=bias)
        bias_step = AddBiasTransform(bias)
        pipeline = A.Compose([
            A.Lambda(image=lambda img, **kwargs: bias_step(img)),  # custom additive-bias block
            A.RandomResizedCrop(32, 32, scale=(hyper_params['crop'], 1.0), ratio=(ratio, 1.0 / ratio)),
            A.Rotate(limit=hyper_params['angle']),
            A.Normalize(mean=run_mean, std=run_std),
            ToTensorV2()
        ])
        print(f"Exp {run_idx}: Generating dataset from transform")
        # NOTE: the caller's datasets are mutated in place; they keep the last
        # pipeline after this function returns.
        train_dataset.transform = pipeline
        valid_dataset.transform = pipeline
        train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=128, shuffle=False)
        train_losses, valid_losses = train_and_evaluate(model,
                                                        train_loader,
                                                        valid_loader,
                                                        loss_fn,
                                                        opt,
                                                        epochs)
        results.append({
            "ratio": ratio,
            "bias": bias,
            "train_losses": train_losses,
            "valid_losses": valid_losses,
            "model_state_dict": model.state_dict()
        })
        # Drop references and release cached GPU memory before the next run.
        del model, loss_fn, opt
        del train_loader, valid_loader
        torch.cuda.empty_cache()
    return results
# Run all 16 ratio/bias combinations (trains 16 models; slow).
exp3_2 = run_exp3_2(candidate_ratios, candidate_channel_biases, exp3_2_hyperparams, exp3_train_dataset, exp3_valid_dataset, exp3_universal_train_dataset)
# Timestamp with the dot stripped keeps checkpoint filenames unique.
time_str = str(time.time()).replace(".", "")
torch.save(exp3_2, f"./models/exp3-2_{time_str}.pth")
# Reload a previously saved run so the analysis below works without retraining.
exp3_2_loaded = torch.load("./models/exp3-2_17304509951687949.pth")
exp3_2_results = get_experiment_results(exp3_2_loaded, test_hyperparam_names=["ratio", "bias"], extra_loader=exp3_test_loader)
# Loss curves, confusion matrices and precision/recall summaries per combination.
plot_el(exp3_2_loaded, ["ratio", "bias"], n_rows=4, n_cols=4)
plot_cm(exp3_2_results, ["ratio", "bias"], n_rows=4, n_cols=4)
exp3_2_accuracies, exp3_2_f1s = plot_pr(exp3_2_results, ["ratio", "bias"], n_rows=4, n_cols=4)
print_metrics(exp3_2_accuracies, exp3_2_f1s)
Accuracies: 0.888 0.889 0.861 0.757 0.889 0.870 0.857 0.815 0.868 0.874 0.852 0.853 0.878 0.878 0.829 0.799 F1 Score Lists: 0.876 0.916 0.932 0.864 0.911 0.892 0.852 0.860 0.821 0.847 | Avg F1=0.877, Std F1=0.03321812117840835 0.894 0.929 0.924 0.843 0.914 0.892 0.858 0.893 0.805 0.831 | Avg F1=0.878, Std F1=0.03979292054828506 0.838 0.901 0.898 0.811 0.883 0.869 0.820 0.870 0.767 0.835 | Avg F1=0.849, Std F1=0.04051631612264006 0.790 0.756 0.826 0.692 0.710 0.778 0.758 0.814 0.692 0.707 | Avg F1=0.752, Std F1=0.04753679473121518 0.883 0.925 0.929 0.849 0.918 0.892 0.864 0.875 0.803 0.828 | Avg F1=0.877, Std F1=0.0396059755138224 0.864 0.912 0.918 0.823 0.881 0.871 0.836 0.872 0.781 0.807 | Avg F1=0.857, Std F1=0.041990455912870366 0.866 0.876 0.911 0.813 0.872 0.872 0.840 0.836 0.768 0.816 | Avg F1=0.847, Std F1=0.03901193467163097 0.779 0.880 0.875 0.739 0.836 0.811 0.780 0.830 0.715 0.742 | Avg F1=0.799, Std F1=0.05436474612085296 0.876 0.910 0.913 0.815 0.891 0.874 0.825 0.874 0.773 0.805 | Avg F1=0.856, Std F1=0.04531640669527025 0.875 0.910 0.918 0.812 0.898 0.880 0.857 0.854 0.804 0.832 | Avg F1=0.864, Std F1=0.03745148690354468 0.852 0.893 0.901 0.782 0.875 0.859 0.830 0.860 0.764 0.791 | Avg F1=0.841, Std F1=0.04502402786643079 0.836 0.896 0.917 0.781 0.836 0.839 0.845 0.872 0.788 0.810 | Avg F1=0.842, Std F1=0.0414119020757317 0.866 0.904 0.927 0.846 0.904 0.886 0.852 0.857 0.809 0.804 | Avg F1=0.865, Std F1=0.038508310961775434 0.830 0.905 0.924 0.853 0.894 0.882 0.855 0.899 0.810 0.825 | Avg F1=0.868, Std F1=0.03653449685505532 0.802 0.875 0.887 0.775 0.845 0.824 0.796 0.826 0.729 0.775 | Avg F1=0.814, Std F1=0.04569527257505044 0.750 0.868 0.872 0.738 0.835 0.804 0.747 0.812 0.654 0.701 | Avg F1=0.778, Std F1=0.06818637211280652 Best: 2-th
# ROC-AUC curves for experiment 3-2: per-class curves first, then macro/micro averages.
for curve_kind in ("all", "macro_micro"):
    plot_rocauc(exp3_2_results, ["ratio", "bias"], curve_type=curve_kind, n_rows=4, n_cols=4)
# Candidate training lengths and learning rates for the epoch/lr sweep.
candidate_epoch_num = [20, 40, 60, 80]
candidate_lr = [1e-3, 1e-4, 1e-5, 1e-6]
From the controlled variables, generate all possible experiment combinations.
# Preview every (epochs, learning-rate) pair that will be run.
combinations = list(itertools.product(candidate_epoch_num, candidate_lr))
for epochs, lr in combinations:
    print(f"[{epochs}, {lr:.0e}]", end=" ")